Note: Grading is based both on your graphs and verbal explanations. Follow all best practices as discussed in class, including choosing appropriate parameters for all graphs. Do not expect the assignment questions to spell out precisely how the graphs should be drawn. Sometimes guidance will be provided, but the absense of guidance does not mean that all choices are ok.

Read Graphical Data Analysis with R, Ch. 4, 5

1. House features

[5 points]

Data: ames in the openintro package

  1. Create a frequency bar chart for the roof styles of the properties.
library(tidyverse)
library(openintro)
ggplot(ames, aes(fct_infreq(Roof.Style))) +
  geom_bar() +
  scale_y_continuous(expand = expansion(mult = c(0, .05))) +
  xlab("Roof Style")

  1. Create a frequency bar chart for the variable representing the month in which the property was sold.
ggplot(ames, aes(factor(month.abb[Mo.Sold], levels = month.abb))) +
  geom_bar() +
  scale_y_continuous(expand = expansion(mult = c(0, .05))) +
  xlab("")

  1. List all the factor variables that have "Ex" "Fa" "Gd" "Po" "TA" as levels.
myvars <- map(select(ames, where(is.factor)), levels) %>%
  map_df(paste, collapse = " ") %>%
  pivot_longer(everything()) %>%
  mutate(value = trimws(value)) %>%
  filter(value == "Ex Fa Gd Po TA") %>%
  pull(name)
myvars
## [1] "Exter.Cond"   "Bsmt.Qual"    "Bsmt.Cond"    "Heating.QC"   "Kitchen.Qual"
## [6] "Fireplace.Qu" "Garage.Qual"  "Garage.Cond"
  1. Create faceted bar charts using facet_wrap() to display the frequency distribution of all variables from part c). (Hint: transform the data first with pivot_longer())
ames %>%
  select(all_of(myvars)) %>%
  pivot_longer(everything()) %>%
  mutate(value = na_if(value, "")) %>% 
  mutate(value = fct_recode(value, "Excellent" = "Ex", "Good" = "Gd",
                            "Typical" = "TA", "Fair" = "Fa", "Poor" = "Po")) %>%
  mutate(value = fct_relevel(value, "Poor", "Fair", "Typical", "Good", "Excellent")) %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap(~name, ncol = 2)

2. Pet names

[10 points]

Data: seattlepets in the openintro package

  1. Create separate Cleveland dot plots for the 30 most popular dog names and 30 most popular cat names.
pets <- seattlepets
pets %>%
  filter(species == "Dog") %>%
  group_by(animal_name) %>%
  count() %>%
  ungroup() %>%
  slice_max(order_by = n, n = 30) %>%
  ggplot(aes(n, reorder(animal_name, n))) +
  geom_point() +
  ggtitle("Top Dog Names in Seattle") + 
  theme_linedraw()

pets %>%
  filter(species == "Cat") %>%
  group_by(animal_name) %>%
  count() %>%
  ungroup() %>%
  slice_max(order_by = n, n = 30)  %>%
  mutate(animal_name = fct_reorder(animal_name, n)) %>% 
  mutate(animal_name = fct_explicit_na(animal_name, "NA")) %>% 
  mutate(animal_name = fct_relevel(animal_name, "NA", )) %>% 
  ggplot(aes(n, animal_name)) +
  geom_point() +
  ggtitle("Top Cat Names in Seattle") + 
  theme_linedraw()

  1. Use a Cleveland dot plot to display the 30 names that are the most “dog” measured by the proportion of all animals with that name that are dogs. (You can remove goat and pig names from the dataset.) Clearly state any decisions you make about what to include and not include and explain your reasoning.
propdogs <- pets %>%
  filter(species %in% c("Cat", "Dog")) %>%
  group_by(animal_name, species) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = "species", values_from = "n") %>%
  mutate(cat = replace_na(Cat, 0), dog = replace_na(Dog, 0), .keep = "unused") %>%
  mutate(total = dog + cat) %>% 
  mutate(prop = dog / total)
  
propdogs %>%
  filter(total >= 25) %>%
  slice_max(order_by = prop, n = 30) %>%
  ggplot(aes(x = prop, y = reorder(animal_name, prop))) +
  geom_point() +
  ggtitle("Animal names that say 'dog'",
          sub = "(among names with counts of at least 25)") +
  ylab("") +
  theme_linedraw()

We used a cutoff of 25 for the names that we choose to include. 25 is somewhat arbitrary – the main point is not to include those with a very small number of occurrences in the dataset.

  1. Find the 30 most popular names for dogs and cats combined, and create a multidot Cleveland dot plot showing the counts for dogs, cats, and total for each of these 30 names. (One color for dogs, one color for cats, one color for total.) Order the dots by the total count.
pets <- seattlepets
top30names <- pets %>%
  filter(species %in% c("Cat", "Dog")) %>%
  group_by(animal_name) %>%
  summarize(n = n()) %>%
  slice_max(order_by = n, n = 30) %>%
  pull(animal_name)

top30byspecies <- pets %>%
  filter(animal_name %in% top30names) %>%
  filter(species %in% c("Cat", "Dog")) %>%
  group_by(animal_name, species) %>%
  summarize(count = n()) %>%
  ungroup()

top30total <- top30byspecies %>%
  group_by(animal_name) %>%
  summarize(count = sum(count)) %>%
  mutate(species = "Total")

top30 <- bind_rows(top30byspecies, top30total) %>% 
  mutate(animal_name = fct_reorder2(animal_name, species == "Total", -count)) %>% 
  mutate(animal_name = fct_explicit_na(animal_name, "NA")) %>% 
  mutate(animal_name = fct_relevel(animal_name, "NA"))
  
ggplot(top30, aes(x = count, y = animal_name, color = species)) +
  geom_point() +
  ggtitle("Top 30 Pet Names Overall", sub = "dogs and cats combined") +
  ylab("") +
  theme_linedraw() +
  theme(legend.position = "top")

  1. Create a scatterplot of popular cat names vs. popular dog names. Clearly some names are more “dog” names and some are more “cat” names. Decide on a metric for defining what is a “dog” name, a “cat” name, and a “neutral” name and state it explicity. What is your metric?
scatterpets <- pets %>%
  filter(species %in% c("Cat", "Dog")) %>%
  group_by(animal_name, species) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = "species", values_from = "count") %>%
  mutate(Cat = replace_na(Cat, 0), Dog = replace_na(Dog, 0))

cat2dogratio = sum(scatterpets$Cat)/sum(scatterpets$Dog)

g <- scatterpets %>%
  filter(!is.na(animal_name)) %>% 
  ggplot(aes(Dog, Cat)) +
  geom_point(size = .5) +
  geom_abline(slope = cat2dogratio, intercept = 0) +
  ggtitle("Cat vs. Dog Names", subtitle = "(the line represents cat to dog name ratios equal to that of the full population)") +
  xlab("Number of Dogs") +
  ylab("Number of Cats") +
  theme_linedraw()
g

Solution

The graph above shows the counts of cat names plotted against the counts of dog names. (I have removed the NA value so it’s easier to see the majority of the data). Points that fall on the line above represent neutral names, as the slope of the line is equal to the ratio overall of cats to dogs in the dataset. Points that fall above the line are more “cat” names; points that fall below the line are more “dog” names. Let’s look at a small section of the graph:

g + geom_text(aes(label = ifelse(Dog == 32, animal_name, "")),
              size = 2.5, hjust = 0, nudge_x = .1) +
  scale_x_continuous(limits = c(30, 40), breaks = 30:40) + 
  scale_y_continuous(limits = c(0, 40))

We see that Cosmo is a neutral name. There are 16 cats and 32 dogs with the name; the ratio is very close to the overall percentage of cats to dogs in the dataset: 0.4915722. The name above the line, Callie, is more “cat,” while the names below the line–Lexi, Finnegan, Bo, Kuma, Roxie, and Dixie–being more “dog.” Of those, Dixie is the “doggiest”: there are 32 dogs but only 2 cats named Dixie.

Since there are very few names that fall as close to the line as Cosmo, we will create a buffer around the line to create a neutral name territory.

After experimenting with a number of options, I found that adding lines which create angles of .12 radians (about 6.9 degrees) on either side of the original diagonal and adding or subtracting a small intercept creates a reasonable looking neutral territory.

angle <- atan(cat2dogratio)
catslope <- tan(angle + .12)
dogslope <- tan(angle - .12)
catintercept <- 3/cos(angle+.12)
dogintercept <- -3/cos(angle-.12)

g +
  geom_abline(slope = catslope, intercept = catintercept) +
  geom_abline(slope = dogslope, intercept = dogintercept) +
  coord_fixed() +
  scale_y_continuous(limits = c(0, 200))

  1. Create a new variable for type of name (“dog”, “cat” or “neutral”) and redraw the scatterplot coloring the points by this variable. Label individual points as you see fit (don’t label all of them.)

Next we need to determine the region for each point and label some points. Labeling names with at least 35 cats or 100 dogs captured most of the very popular names without overlapping labels.

library(ggrepel)
scatterpets <- scatterpets %>% 
  mutate(name_type = case_when(
    Cat > catslope*Dog + catintercept ~ "Cat",
    Cat < dogslope*Dog + dogintercept ~ "Dog",
    TRUE ~ "Neutral"
  )) %>% 
  mutate(name_type = fct_relevel(name_type, "Cat", "Neutral", "Dog"))

linecolor <- "grey50"

scatterpets %>%
  filter(!is.na(animal_name)) %>%
  mutate(label = ifelse(Cat > 35 | Dog > 100, animal_name, "")) %>%
  ggplot(aes(Dog, Cat, color = name_type, label = label)) +
  geom_point(size = .5) +
  geom_abline(slope = catslope, intercept = catintercept, color = linecolor) +
  geom_abline(slope = dogslope, intercept = dogintercept, color = linecolor) +
  geom_text_repel(size = 2.5, nudge_y = 10, show.legend = FALSE) +
  scale_y_continuous(limits = c(0, 200)) +
  ggtitle("Cat vs. Dog Names") +
  xlab("Number of Dogs") +
  ylab("Number of Cats") +
  theme_linedraw() +
  theme(legend.position = "top")

Alternative Solution

One way to think about how much a name is a “dog” name is to consider the probability of getting the observed proportion of dogs for a given name if there were no connection between name and species. For example, there are 126 pets named “Riley.” Since 67.04% of all of the pets are dogs we would expect about .6704 * 126 = 84.47 of the Rileys to be dogs. However the actually number of dog Rileys is 117. We can set up a hypothesis test to find the probability of getting a result this far (or more) from what’s expected.

We set up a null hypothesis: \(p = .6704\) and then find \(P(\hat{p} > 117/126)\) (and then double it to find the p-value for a two-sided test). We can use binom.test() to get the p-value directly:

binom.test(117, 126, .6704)
## 
##  Exact binomial test
## 
## data:  117 and 126
## number of successes = 117, number of trials = 126, p-value = 6.623e-12
## alternative hypothesis: true probability of success is not equal to 0.6704
## 95 percent confidence interval:
##  0.8687359 0.9668197
## sample estimates:
## probability of success 
##              0.9285714

Since the p-value is very small it’s very unlikely that just by chance a lot of dogs are named Riley. Using this method, I found the 30 names with the lowest p-values, in other words, the ones that provide the most evidence against the null hypothesis that a cat or dog is equally likely to have that name.

For this graph, we labeled names with the lowest p-values, that is, the most “doggiest” and “cattiest” names:

bt <- function(cat, dog) binom.test(x = dog, n = cat + dog, p = .67)$p.value

scatterpets <- pets %>%
  filter(!is.na(animal_name)) %>% 
  filter(species %in% c("Cat", "Dog")) %>%
  group_by(animal_name, species) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  pivot_wider(names_from = "species", values_from = "count") %>%
  mutate(Cat = replace_na(Cat, 0), Dog = replace_na(Dog, 0)) %>%
  mutate(animal_name = replace_na(animal_name, "NA")) %>%
  mutate(pvalue = map2_dbl(Cat, Dog, bt)) %>%
  mutate(Type = case_when(pvalue <= .05 & Dog > 2*Cat ~ "Dog",
                          pvalue <= .05 & 2*Cat > Dog ~ "Cat",
                          TRUE ~ "Neutral")) %>% 
  mutate(Type = fct_relevel(Type, "Cat", "Neutral", "Dog"))

  
scatterpets %>%
  mutate(label = ifelse(pvalue <= .00001, animal_name, "")) %>%
  ggplot(aes(Dog, Cat, label = label, color = Type)) +
  geom_point(size = .5) +
  geom_text(size = 2.5, nudge_y = 10, show.legend = FALSE) +
  ggtitle("Cat vs. Dog Names") +
  xlab("Number of Dogs") +
  ylab("Number of Cats") +
  theme_linedraw() +
  theme(legend.position = "top")

And a closer look at names with fewer than 25 cats and fewer than 25 dogs:

scatterpets %>%
  mutate(label = ifelse(Cat > 18, animal_name, "")) %>% 
  ggplot(aes(Dog, Cat, label = label, color = Type)) +
  geom_point() +
  geom_text(size = 2.5, nudge_x = .5, hjust = 0, show.legend = FALSE) +
  coord_equal() +
  scale_x_continuous(limits = c(0, 25)) +
  scale_y_continuous(limits = c(0, 25)) +
  ggtitle("Cat vs. Dog Names") +
  xlab("Number of Dogs") +
  ylab("Number of Cats") +
  theme_linedraw()

Note the skew toward cat names for low frequency names. This makes sense there are half as many cats in the dataset as dogs.

  1. What are your most interesting discoveries from this dataset?

Many more cats are missing names than dogs: NA is the most popular cat “name” whereas NA does not make the top 30 for dogs. It’s not clear if the cats labeled NA in the dataset don’t have names or if their names are missing for some reason(s).

There are about twice as many dogs as cats. Since the helpfile for the dataset describes it as “Names of registered pets in Seattle, WA, between 2003 and 2018, provided by the city’s Open Data Portal” we can infer that this is the actual ratio of dogs to cats as pets in Seattle during this period.

Even taking into consideration that there are more dogs, the very popular dog names are more common proportionally than the very popular cat names. For example, 9.6 of every 1000 dogs are named Lucy (the most popular dog name) whereas only 6.6 of every 1000 cats are named Luna (the most pouplar cat name).

To a somewhat surprising degree, most names are given both to cats and dogs. Lucy is the #1 dog name AND the #2 cat name. Other very popular names given to dogs and cats include: Charlie, Luna, Bella, Max, Daisy, Molly, Jack, Lily and Stella. All of these names appear at least 225 times in the dataset.

In contrast, the maximum number of dogs with a single dog-only name is 16. Kodiak, Odio, and Oso are tied for first place in this category. Similarly, after NA the most popular cat-only name is Mittens, given to 19 cats.

3. House sizes and prices

[7 points]

Data: ames in the openintro package

For all, adjust parameters to the levels that provide the best views of the data.

Draw four plots of price vs. area with the following variations:

  1. Scatterplot – adjust point size and alpha.
ggplot(ames, aes(area, price/1000)) +
  geom_point(size = 1, alpha = .5, stroke = 0) +
  ylab("price (in 1000s of U.S. $)") +
  theme_classic()

  1. Scatterplot with density contour lines
ggplot(ames, aes(area, price/1000)) +
  geom_point(alpha = .5, stroke = 0, color = "grey70") +
  geom_density_2d() +
  ylab("price (in 1000s of U.S. $)") +  
  theme_classic(14)

  1. Hexagonal heatmap of bin counts
# RColorBrewer::brewer.pal(9, "Blues")
# [1] "#F7FBFF" "#DEEBF7" "#C6DBEF" "#9ECAE1" "#6BAED6" "#4292C6" "#2171B5" "#08519C" "#08306B"
ggplot(ames, aes(area, price/1000)) +
  geom_hex(bins = 50) +
  scale_fill_gradient(low = "#DEEBF7", high = "#08519C") + 
  ylab("price (in 1000s of U.S. $)") +
  theme_classic()

  1. Square heatmap of bin counts
ggplot(ames, aes(area, price/1000)) +
  geom_bin2d(bins = 50) +
  scale_fill_gradient(low = "#DEEBF7", high = "#08519C") +
  ylab("price (in 1000s of U.S. $)") +
  theme_classic(14)

  1. Describe noteworthy features of the data, using the “Movie ratings” example on page 82 (last page of Section 5.3) as a guide.

There are three clear outliers with high area–around 5000 square feet–and low prices (around $200k).

There are two outliers in both dimensions: high area and high price: area > 4000 square feet and price > $700k.

There is a collection of mid/large sized houses–between 2500 and 3000 square feet– with very high prices. They form a ring at the $600k price line.

There are no small area houses with high prices.

Although there’s a lot of overplotting, it is possible to see from the heatmaps and plot with density contour lines that houses with areas between 1000 and 1500 square feet with prices between $100k and $200k occur the most often in the dataset.

The price variance increases as area increases. For example, houses with areas around 3000 square feet had prices between about $200k and $600k, quite a large range.

There appears to be a straight line border with a positive slope above the collection of points in the scatterplot. It serves as an upper limit of price for a particular area. For example, no houses with area of 1000 square feet had a price above $200k and no houses with an area of 2000 square feet had a price above $400k. There is no comparable boundary on the lower limit of price. In other words, there are some large houses with quite low prices.

4. Correlations

[8 points]

Data: ames in the openintro package

  1. Recreate the scatterplot from part 3 (price vs. area) this time faceting on Neighborhood (use facet_wrap(). Add best fitting lines and sort the facets by the slope of the best fitting line from low to high. (Use lm() to get the slopes.)
cor_df <- ames %>%
  group_by(Neighborhood) %>%
  summarize(cor = cor(area, price), beta = lm(price~area)$coefficients[2], meanprice = mean(price)) %>%
  ungroup() %>%
  arrange(beta) 

ames %>%
  ggplot(aes(area/1000, price/1000)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(size = .15) +
  facet_wrap(~fct_relevel(Neighborhood, as.character(cor_df$Neighborhood))) +
  xlab("living area (in 1000s of square feet)") +
  ylab("price (in 1000s of U.S. $)") +
  theme_linedraw()

  1. Is the slope higher in neighborhoods with higher mean housing prices? Present graphical evidence and interpret in the context of this data.
ggplot(cor_df, aes(meanprice, beta, label = Neighborhood)) +
  geom_point() +
  geom_text(nudge_y = 10, size = 3) +
  geom_smooth(method="lm", se= FALSE)

It seems that there are higher slopes in neighborhoods with higher mean housing prices, since this graph has a positive slope.

  1. Repeat parts a) and b) with the following adjustment: order the faceted plots by \(R^2\) from the linear regression of price on area by Neighborhood. Is the \(R^2\) higher in neighborhoods with higher mean housing prices? Are the results the same for slope and \(R^2\)? Explain using examples from the graphs.
r2_df <- cor_df %>% arrange(cor^2)
ames %>%
  ggplot(aes(area/1000, price/1000)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point(size = .15) +
  facet_wrap(~fct_relevel(Neighborhood, as.character(r2_df$Neighborhood))) +
  xlab("living area (in 1000s of square feet)") +
  ylab("price (in 1000s of U.S. $)") +
  theme_linedraw()

r2_df$rsq = cor_df$cor^2
ggplot(r2_df, aes(meanprice, rsq, label = Neighborhood)) +
  geom_point() +
  geom_text(nudge_y = .04, size = 3) +
  geom_smooth(method="lm", se= FALSE)

It seems that the \(R^2\) is higher in neighborhoods with higher mean housing prices, since this graph has a positive slope.

The results are not the same for slope and \(R^2\). For example, Crawford has a high slope but low \(R^2\). This is because there is a lot of variance of price for similar areas.